knitr::opts_knit$set(root.dir = ".")
library(cowplot) # plot_grid()
library(dplyr) # left_join()
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # ggplot()
library(gridExtra) # grid.arrange()
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(parallel) # detectCores()
library(rtracklayer) # import()
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
library(scCustomize) # Merge_Seurat_List()
## Loading required package: Seurat
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'sp'
## The following object is masked from 'package:IRanges':
##
## %over%
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:GenomicRanges':
##
## intersect
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following object is masked from 'package:IRanges':
##
## intersect
## The following object is masked from 'package:S4Vectors':
##
## intersect
## The following object is masked from 'package:BiocGenerics':
##
## intersect
## The following objects are masked from 'package:base':
##
## intersect, t
## scCustomize v2.1.2
## If you find the scCustomize useful please cite.
## See 'samuel-marsh.github.io/scCustomize/articles/FAQ.html' for citation info.
library(Seurat) # Read10X_h5()
library(stringr) # str_match()
# variables
out <- "../../results/pass_1/all_clusters/"
sample_order <- c("E3.M","E3.F","E4.M","E4.F")
sample_colors <- c("#26946A","#1814A1","#EAC941","#DF5F00")
sample_order2 <- c("Male E3", "Male E4", "Female E3", "Female E4")
isoform_order <- c("E4","E3")
isoform_colors <- c("darkgray","cornflowerblue")
sex_order <- c("Male","Female")
sex_colors <- c("darkgray","purple")
# thresholds
nCount.min <- 500
nCount.max <- 20000
nFeature.min <- 250
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 50
hb.cutoff <- 3
# functions
source("../../../functions/Kennedi_single_cell_functions.R")
saveToPDF <- function(...) {
d = dev.copy(pdf,...)
dev.off(d)
}
prefix <- "../../counts/"
suffix <- "/outs/filtered_feature_bc_matrix.h5"
if (file.exists("../../rObjects/raw_h5.rds")) {
mouse <- readRDS("../../rObjects/raw_h5.rds")
} else {
# individual sample objects
E3.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_F",suffix)))
E3.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_M",suffix)))
E4.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_F",suffix)))
E4.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_M",suffix)))
# merge objects
mouse <- merge(x = E3.F,
y = c(E3.M,E4.F,E4.M),
add.cell.ids = c("E3.F","E3.M","E4.F","E4.M"),
project = "Mouse Meningeal Dura scRNAseq")
# cleanup and save
remove(E3.F, E3.M, E4.F, E4.M)
saveRDS(mouse, "../../rObjects/raw_h5.rds")
}
# preview
mouse
## An object of class Seurat
## 32285 features across 33388 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
## 4 layers present: counts.1, counts.2, counts.3, counts.4
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
genes <- readRDS("../../rObjects/annotation.rds")
} else {
gtf.file <- "../../refs/mouse_genes.gtf"
genes <- rtracklayer::import(gtf.file)
genes <- as.data.frame(genes)
genes <- genes[genes$type == "gene",]
saveRDS(genes, "../../rObjects/annotation.rds")
}
# create sample column
barcodes <- colnames(mouse)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample)
## sample
## E3.F E3.M E4.F E4.M
## 6747 8029 11204 7408
mouse$sample <- factor(sample, levels = sample_order)
table(mouse$sample) # check
##
## E3.M E3.F E4.M E4.F
## 8029 6747 7408 11204
Idents(mouse) <- mouse$sample
# age column
mouse$age <- "14 months"
# sex column
sex <- str_match(mouse$sample, "E\\d.([FM])")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)
# Apoe isoform column
isoform <- str_match(mouse$sample, "(E\\d).[FM]")[,2]
mouse$isoform <- factor(isoform, levels = isoform_order)
# sample2 column (for title formatting)
mouse$sample2 <- paste(mouse$sex, mouse$isoform, sep = " ")
# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)
# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
## [1] "Mrpl15" "Mrpl30" "Mrps9" "Mrpl44" "Mrps14"
## [6] "Mrpl41" "Mrps2" "Mrps5" "Mrps26" "Mrps28"
## [11] "Mrpl47" "Mrpl24" "Mrpl9" "Mrps21" "Mrpl50"
## [16] "Mrpl37" "Mrps15" "Mrpl20" "Mrpl33" "Mrpl1"
## [21] "Mrps18c" "Mrps17" "Mrps33" "Mrpl35" "Mrpl19"
## [26] "Mrpl53" "Mrps25" "Mrpl51" "Mrps35" "Mrps12"
## [31] "Mrpl46" "Mrps11" "Mrpl48" "Mrpl17" "Mrpl23"
## [36] "Mrps31" "Mrpl34" "Mrpl4" "Mrps22" "Mrpl3"
## [41] "Mrpl54" "Mrpl42" "Mrps24" "Mrpl22" "Mrpl55"
## [46] "Mrps23" "Mrpl27" "Mrpl10" "Mrpl45" "Mrpl58"
## [51] "Mrps7" "Mrpl38" "Mrpl12" "Mrpl32" "Mrpl36"
## [56] "Mrps27" "Mrps36" "Mrps30" "Mrps16" "Mrpl52"
## [61] "Mrpl57" "Mrpl13" "Mrpl40" "Mrpl39" "Mrps6"
## [66] "Mrpl18" "Mrps34" "Mrpl28" "Mrps18b" "Mrpl14"
## [71] "Mrps18a" "Mrpl2" "Mrps10" "Mrpl21" "Mrpl11"
## [76] "Mrpl49" "Mrpl16" "Mrpl43" "Rpl7" "Rpl31"
## [81] "Rpl37a" "Rps6kc1" "Rpl7a" "Rpl12" "Rpl35"
## [86] "Rps21" "Rpl22l1" "Rps3a1" "Rps27" "Rpl34"
## [91] "Rps20" "Rps6" "Rps8" "Rps6ka1" "Rpl11"
## [96] "Rpl22" "Rpl9" "Rpl5" "Rplp0" "Rpl6"
## [101] "Rpl21" "Rpl32" "Rps9" "Rpl28" "Rps5"
## [106] "Rps19" "Rps16" "Rps11" "Rpl13a" "Rpl18"
## [111] "Rps17" "Rps3" "Rpl27a" "Rps13" "Rps15a"
## [116] "Rplp2" "Rpl18a" "Rpl13" "Rps25" "Rpl10-ps3"
## [121] "Rplp1" "Rpl4" "Rps27l" "Rpl29" "Rps27rt"
## [126] "Rpsa" "Rpl14" "Rps12" "Rps15" "Rpl41"
## [131] "Rps26" "Rps27a" "Rpl26" "Rpl23a" "Rpl9-ps1"
## [136] "Rps6kb1" "Rpl23" "Rpl19" "Rpl27" "Rpl38"
## [141] "Rps7" "Rpl10l" "Rps29" "Rpl36al" "Rps6kl1"
## [146] "Rps6ka5" "Rps23" "Rpl15" "Rps24" "Rpl36a-ps1"
## [151] "Rpl37" "Rpl30" "Rpl8" "Rpl3" "Rps19bp1"
## [156] "Rpl39l" "Rpl35a" "Rpl24" "Rps6ka2" "Rps2"
## [161] "Rpl3l" "Rps10" "Rpl10a" "Rps28" "Rps18"
## [166] "Rpl7l1" "Rpl36" "Rpl36-ps4" "Rps14" "Rpl17"
## [171] "Rps6kb2" "Rps6ka4" "Rpl9-ps6" "Rpl39" "Rpl10"
## [176] "Rps4x" "Rps6ka6" "Rpl36a" "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")
ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
ggtitle("Raw: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1
# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1756 rows containing non-finite outside the scale range
## (`stat_density()`).
# normalize data for violin plots
mouse <- NormalizeData(mouse, assay = "RNA")
## Normalizing layer: counts.1
## Normalizing layer: counts.2
## Normalizing layer: counts.3
## Normalizing layer: counts.4
# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v1
# percent violins
v2 <- VlnPlot(mouse,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v2
s1 <- ggplot(
mouse@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
s2 <- FeatureScatter(mouse,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s2
# filter
mouse.filtered <- subset(mouse,
subset = (nCount_RNA > nCount.min) &
(nCount_RNA < nCount.max) &
(nFeature_RNA > nFeature.min) &
(cell.complexity > complexity.cutoff) &
(percent.mt < mt.cutoff) &
(percent.hb < hb.cutoff))
# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "12151 cells removed"
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes
mouse.filtered <- JoinLayers(mouse.filtered)
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
nonzero <- counts > 0 # produces logical
keep <- Matrix::rowSums(nonzero) >= 10 # sum the true/false
counts.filtered <- counts[keep,] # keep certain genes
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "10587 features removed"
Remove highly abundant mitochondrial transcripts.
# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "13 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")
ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1327 rows containing non-finite outside the scale range
## (`stat_density()`).
# normalize data before violin plots
mouse.filtered <- NormalizeData(mouse.filtered)
## Normalizing layer: counts
# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v3
# percent violins
v4 <- VlnPlot(mouse.filtered,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v4
s3 <- ggplot(
mouse.filtered@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
s4 <- FeatureScatter(mouse.filtered,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s4
# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
aes(x = sample,
y = log10(nFeature_RNA),
fill=sample)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ggtitle("Unique Genes / Cell / Sample") +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("Sample")
b1
df <- data.frame(gene_name = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df <- df[order(df$rsum, decreasing = TRUE),]
rownames(df) <- 1:nrow(df)
head(df, 10)
## gene_name rsum
## 1 Gm42418 11194504
## 2 Malat1 3490427
## 3 Cmss1 741422
## 4 Tmsb4x 732027
## 5 S100a9 630464
## 6 Camk1d 590094
## 7 S100a8 456591
## 8 Cd74 433221
## 9 Igkc 426296
## 10 AY036118 396173
markers <- "../../../functions/cell_cycle_markers.tsv"
path <- paste0(out, "unwanted_variation")
mouse.filtered[["phase"]] <- cell_cycle_QC(obj = mouse.filtered,
markersPath = markers,
species = "mouse",
outDir = path)
mouse.filtered[["mito.factor"]] <- mitochondria_QC(obj = mouse.filtered,
outDir = path)
# split
mouse.filtered[["RNA"]] <- split(mouse.filtered[["RNA"]],
f = mouse.filtered$sample)
# transform
mouse.filtered <- SCTransform(mouse.filtered, verbose = FALSE)
# run PCA on the merged object
mouse.filtered <- RunPCA(object = mouse.filtered, assay = "SCT")
## Warning in PrepDR(object = object, features = features, verbose = verbose): The
## following 13 features requested have not been scaled (running reduction without
## them): Ighg1, Rag1, Igll1, Prss34, Gm30948, Igkv6-15, Gm30613, Slc26a4,
## Atp6v1c2, Fut9, Ighv1-80, Igkv8-19, Tnn
## PC_ 1
## Positive: Mrc1, Cd74, Lyz2, F13a1, Slc9a9, Arhgap15, Bank1, S100a9, H2-Eb1, C1qa
## S100a8, Slc8a1, Apoe, H2-Aa, H2-Ab1, Skap1, Mctp1, C1qb, Pid1, Ptprc
## Inpp4b, C1qc, Rbpj, Pf4, Kcnq5, Rnf150, Zeb2, Dock2, Csf1r, Ctsc
## Negative: Gpc6, Flt1, Ptprg, Ldb2, Foxp2, Bnc2, Cdh11, Slc4a10, Ptprb, Igfbp5
## Pcdh7, Auts2, Ptprm, Plcb1, Mir100hg, Prkg1, Plpp3, Cacna1c, Cyyr1, Col12a1
## Tenm3, Igfbp7, Emcn, Ptprd, Nfib, Mecom, Stox2, 9530026P05Rik, Pdgfd, Adamts9
## PC_ 2
## Positive: Gpc6, Cdh11, Foxp2, Slc4a10, Bnc2, Igfbp5, Col12a1, Tenm3, Mir100hg, Cacna1c
## Col1a2, Ptprd, Slit2, Zfhx4, Slc38a2, Epha3, Prrx1, Lama2, Pdzrn3, Col1a1
## Egfr, Slc47a1, Ank2, Tspan11, Boc, Map1b, Naaladl2, Eya2, Snhg11, Adam12
## Negative: Flt1, Ldb2, Ptprg, Ptprb, Plcb1, Cyyr1, Emcn, Igfbp3, Adgrl4, Kdr
## Ptprm, Stox2, Plpp1, Podxl, Plvap, Mecom, Prex2, Arl15, Col4a1, Adgrf5
## Dysf, Igfbp7, Pecam1, Sema6a, Col4a2, Samd12, Rapgef4, Hmcn1, Insr, Etl4
## PC_ 3
## Positive: Mrc1, F13a1, Slc9a9, Apoe, C1qa, Pid1, Dab2, Slc8a1, Rbpj, Rnf150
## Cd74, Stab1, C1qb, Pf4, C1qc, Ctsc, Zdhhc14, Csf1r, Ccl8, Bank1
## H2-Eb1, Cd163, Adgre1, Maf, H2-Aa, Zeb2, Itsn1, Fcrls, Ms4a7, H2-Ab1
## Negative: S100a9, S100a8, Retnlg, Lcn2, Mmp8, Wfdc21, Ngp, Pglyrp1, Ltf, S100a6
## Camp, Hp, Mmp9, Ifitm6, Anxa1, Cxcr2, Cd177, Gda, Hdc, Pygl
## Gsr, S100a11, Abca13, Mxd1, Chil3, Slfn4, Csf3r, Dach1, Ly6g, Slpi
## PC_ 4
## Positive: Kcnq5, Skap1, Inpp4b, Aff3, Pax5, Bach2, Ikzf3, Tmem163, Cd79a, St6galnac3
## Il7r, Ms4a1, Ebf1, Ly6d, Tox, Agbl1, Cacna1e, Itk, Il1rl1, Prkcq
## Arhgap24, Bcl11b, Dach2, Bcl11a, Cd79b, Gata3, Gm2682, Bcl2, Chst3, Vpreb3
## Negative: Mrc1, F13a1, Lyz2, Apoe, C1qa, Pid1, Dab2, Slc8a1, Slc9a9, Pf4
## C1qb, Mctp1, C1qc, Stab1, Rbpj, Rnf150, Ccl8, Csf1r, Aoah, Cd163
## Adgre1, Trf, Ctsc, Fcrls, Ms4a7, Selenop, S100a9, Itsn1, Ctsb, S100a8
## PC_ 5
## Positive: Skap1, Inpp4b, St6galnac3, Il1rl1, Kcnq5, Tox, Dach2, Itk, Il7r, Prkcq
## Bcl11b, Gata3, Gm2682, Tnik, Themis, Large1, Cd226, Camk4, Ikzf2, 1700113H08Rik
## Plxdc2, Icos, Ptpn13, Ms4a4b, Chn2, Pard3, Rnf128, Cd247, Plcl1, Pde7b
## Negative: Pax5, Aff3, Cd79a, Tmem163, Bank1, Ms4a1, Ebf1, Bach2, Ly6d, Agbl1
## Arhgap24, Cd79b, Bcl11a, Cacna1e, Vpreb3, Chst3, Klhl14, Cecr2, Ikzf3, Gm30211
## Btla, Iglc3, Ralgps2, Iglc2, Fcmr, Siglecg, Gm15987, Pou2af1, Cd74, Ighm
# Reset idents and levels
DefaultAssay(mouse.filtered) <- "SCT"
Idents(mouse.filtered) <- "sample"
mouse.filtered$sample <- factor(mouse.filtered$sample,
levels = sample_order)
mouse.filtered$sex <- factor(mouse.filtered$sex,
levels = sex_order)
mouse.filtered$isoform <- factor(mouse.filtered$isoform,
levels = isoform_order)
# Plot PCA
pca1 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "sample",
group.by = "sample",
cols = sample_colors,
ncol = 2)
pca1
pca2 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
pca2
pca3 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
pca3
pca4 <- DimPlot(mouse.filtered,
reduction = "pca",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE,
raster = FALSE)
pca4
e1 <- ElbowPlot(mouse.filtered) +
geom_vline(xintercept = 15, linetype = "dashed", colour = "red")
e1
# integrate
mouse.filtered <- IntegrateLayers(mouse.filtered,
method = "HarmonyIntegration",
assay = "SCT",
orig.reduction = "pca")
## Transposing data matrix
## Using automatic lambda estimation
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony converged after 4 iterations
# re-join layers
mouse.filtered[["RNA"]] <- JoinLayers(mouse.filtered[["RNA"]])
hrm1 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "sample",
group.by = "sample",
cols = sample_colors,
ncol = 2)
hrm1
hrm2 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
hrm2
hrm3 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
hrm3
hrm4 <- DimPlot(mouse.filtered,
reduction = "harmony",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE,
raster = FALSE)
hrm4
# Plot elbow
mouse.filtered@reductions$harmony@stdev <-
apply(mouse.filtered@reductions$harmony@cell.embeddings, 2, sd)
e1 <- ElbowPlot(mouse.filtered,
reduction = "harmony") +
geom_vline(xintercept = 15, linetype = "dashed", color = "red")
e1
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# run UMAP
mouse.filtered <- RunUMAP(mouse.filtered,
dims = 1:15,
reduction = "harmony",
assay = "SCT",
n.components = 3) # set to 3 to use with VR
## 15:50:35 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:50:35 Read 21237 rows and found 15 numeric columns
## 15:50:35 Using Annoy for neighbor search, n_neighbors = 30
## 15:50:35 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:50:38 Writing NN index file to temp file /tmp/RtmpborP80/file278b265477b7d4
## 15:50:38 Searching Annoy index using 1 thread, search_k = 3000
## 15:50:46 Annoy recall = 100%
## 15:50:47 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 15:50:49 Initializing from normalized Laplacian + noise (using RSpectra)
## 15:50:50 Commencing optimization for 200 epochs, with 877344 positive edges
## 15:50:59 Optimization finished
# plot UMAP
DimPlot(mouse.filtered,
cols = sample_colors,
shuffle = TRUE)
# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.filtered,
assay = "SCT", # set as default after SCTransform
reduction = "harmony",
dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
algorithm = 1, # 1 = Louvain
resolution = seq(0.1, 0.5, by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 21237
## Number of edges: 656664
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9777
## Number of communities: 12
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 21237
## Number of edges: 656664
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9677
## Number of communities: 15
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 21237
## Number of edges: 656664
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9601
## Number of communities: 18
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 21237
## Number of edges: 656664
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9534
## Number of communities: 21
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 21237
## Number of edges: 656664
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9475
## Number of communities: 26
## Elapsed time: 2 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.5
# 0.1
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE)
# 0.2
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
# 0.3
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.3",
label = TRUE)
# 0.4
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.4",
label = TRUE)
# 0.5
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.5",
label = TRUE)